### Get most diversed gene list
dir_in_res = '../out/20.0216 feat/reg_rf_boruta'
dir_in_anlyz = file.path(dir_in_res, 'anlyz_filtered')
f_featsumm = file.path(dir_in_anlyz,'agg_summary_filtered.csv')
df_aggRes = read.csv(f_featsumm) #aggregated feat summary
div_genes = df_aggRes[,1]
#install.packages("gprofiler2")
library(gprofiler2)
### g:profiler query for the R result
gostres = gost(query = div_genes, organism = "hsapiens", ordered_query = FALSE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = TRUE,
user_threshold = 0.05, correction_method = "g_SCS",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = NULL)
# g:profiler query for the R result --generate link and show result on webpage
# gostres = gost(query = div_genes, organism = "hsapiens", ordered_query = FALSE,
# multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
# measure_underrepresentation = FALSE, evcodes = FALSE,
# user_threshold = 0.05, correction_method = "g_SCS",
# domain_scope = "annotated", custom_bg = NULL,
# numeric_ns = "", sources = NULL,as_short_link = TRUE)
Interactive plot showing gene description
### visulization capped interaction plot (p>16)
# p.capped = gostplot(gostres, capped =TRUE, interactive = T)
### visulization uncapped interaction plot
p.uncapped = gostplot(gostres, capped = FALSE, interactive = T)
p.uncapped
4 out of top5 GO:BP terms are related to cell cycle. These genes include cyclinD gene, cyclin-dependent kinase gene and cell cycle check point.
### Get GO:BP dataframe
p = gostplot(gostres, capped =FALSE, interactive = F)
res.gobp = gostres$result[grep('GO:BP',gostres$result$source),]
### Show data frame -- intersection shows the genes under that term
# cc.genes = res.gobp$intersection[1:5]
### Table only
# publish_gosttable(gostres, res.gobp[c(1:5),],use_colors = T,
# show_columns = c("source", "term_id", "term_name","term_size","intersection_size"),filename = NULL)
# Top5 plot and table
p.gobp = publish_gostplot(p, res.gobp[c(1:5),],filename = NULL)
## The input 'highlight_terms' is a data.frame and therefore the column 'term_id' will be used for detection.
Some top terms are under GO:CC(cellular component). List top significant terms. However, most of the top terms have large term size, so it’s not surprising that most genes are mapped.
### Get GO:CC dataframe
res.gocc = gostres$result[grep('GO:CC',gostres$result$source),]
### GO:CC result table only- TOP15
tab.gocc = publish_gosttable(gostres, res.gocc[c(1:15),],use_colors = T,
show_columns = c("source", "term_id", "term_name","term_size","intersection_size"),filename = NULL)
## The input 'highlight_terms' is a data.frame. The column 'term_id' will be used.
### GO:CC nucleous related
# Get dataframe
# res.gocc.nuc = res.gocc[grep('nuc', res.gocc$term_name),]
# table only
# publish_gosttable(gostres, res.gocc.nuc ,use_colors = T,
# show_columns = c("source", "term_id", "term_name","term_size","intersection_size"),filename = NULL)
# plot and table
# p.gocc.nuc = publish_gostplot(p, res.gocc.nuc,filename = NULL)
### GO:CC lumen related
#get dataframe
res.gocc.lum = res.gocc[grep('lumen', res.gocc$term_name),]
# table only
# publish_gosttable(gostres, res.gocc.lum ,use_colors = T,
# show_columns = c("source", "term_id", "term_name","term_size","intersection_size"),filename = NULL)
# plot and table
p.gocc.lum = publish_gostplot(p, res.gocc.lum,filename = NULL)
## The input 'highlight_terms' is a data.frame and therefore the column 'term_id' will be used for detection.
Catalytic function related terms are also enriched
p.cat = publish_gostplot(p, highlight_terms = c("GO:0032991","GO:0003824","GO:1902494"),filename = NULL)
miRNA that associate with cancer is highly significant
p.mirna=publish_gostplot(p, highlight_terms = c("MIRNA:hsa-miR-16-5p"),filename = NULL)
# get g:profiler version --Please use "set_base_url" and switch to this version for later users
gostres$meta$version
## [1] "e100_eg47_p14_7733820"